package com.guiji.quartz.domain;

import cn.hutool.core.util.RandomUtil;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpResponse;
import cn.wanghaomiao.xpath.model.JXDocument;
import com.guiji.quartz.config.ProxyConfig;
import com.guiji.quartz.task.DataSaveUtil;
import com.guiji.quartz.util.DelPicUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.text.SimpleDateFormat;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawler for beincrypto.com.
 *
 * <p>Scans a fixed set of listing pages for article links, downloads every article through the
 * configured HTTP proxy, extracts the title, paragraph text and featured images, and hands the
 * result to {@link DataSaveUtil#saveData}.
 *
 * @program: cms-vue-plus
 * @description: beincrypto.com article crawler
 * @author: gaoX
 * @date 2021/11/18 16:16
 */
public class Beincrypto {
	private static final Logger log = LoggerFactory.getLogger(Beincrypto.class);

	/** Listing pages that are scanned for article links. */
	private static final List<String> LISTING_URLS = Collections.unmodifiableList(Arrays.asList(
		"https://beincrypto.com/news/",
		"https://beincrypto.com/opinion/",
		"https://beincrypto.com/bitcoin-news/",
		"https://beincrypto.com/altcoin-news/"));

	/**
	 * Matches JSON-escaped article links embedded in a listing page, e.g.
	 * {@code https:\/\/beincrypto.com\/some-article-slug\/}. Compiled once because it is
	 * reused for every listing page.
	 */
	private static final Pattern ARTICLE_URL_PATTERN =
		Pattern.compile("https:\\\\/\\\\/beincrypto\\.com\\\\/.{25,70}?/");

	/**
	 * Entry point of the crawl: collects article URLs from all listing pages, then downloads
	 * and stores each article.
	 *
	 * @throws Exception kept for compatibility with existing callers; failures of individual
	 *                   pages are logged and skipped rather than propagated
	 */
	public static void mainMethod() throws Exception {
		Set<String> resUrl = getUrl(LISTING_URLS);
		getContent(resUrl);
	}

	/**
	 * Downloads every article in {@code listContent}, extracts its data and saves it.
	 * An article that fails at any stage is logged and skipped; the remaining ones are
	 * still processed.
	 *
	 * @param listContent article URLs to process
	 */
	private static void getContent(Set<String> listContent) {
		for (String url : listContent) {
			log.info("getContent执行到-{},listContent长度-{}", url, listContent.size());

			JXDocument jxDocument;
			try {
				HttpResponse response = HttpRequest.get(url)
					.setHttpProxy(ProxyConfig.url, ProxyConfig.ip)
					.setConnectionTimeout(15000)
					.execute();
				jxDocument = new JXDocument(response.body());
			} catch (Exception e) {
				// Passing the exception as the last argument keeps the stack trace in the log.
				log.error("代理失败2-beincrypto-{}", url, e);
				continue;
			}

			String title;
			try {
				title = (String) jxDocument.sel("//h1//text()").get(0);
			} catch (Exception e) {
				log.error("标题错误-------------" + url + "-------------------标题错误", e);
				continue;
			}

			// Paragraphs are joined with "$$$" as the separator.
			StringBuilder content = new StringBuilder();
			try {
				for (Object paragraph : jxDocument.sel("//div[@class='entry-content-inner']/p")) {
					try {
						// Only the first text node of each paragraph is taken.
						List<Object> texts = new JXDocument(paragraph.toString()).sel("//text()");
						if (!texts.isEmpty()) {
							content.append(texts.get(0)).append("$$$");
						}
					} catch (Exception e) {
						// One unparsable paragraph must not discard the whole article.
						log.warn("Skipping unparsable paragraph in {}", url, e);
					}
				}
			} catch (Exception e) {
				log.error("Failed to extract article body from {}", url, e);
				continue;
			}

			List<Object> newImgs = new ArrayList<>();
			try {
				newImgs = jxDocument.sel("//div[@class='featured-images mb-3']//amp-img/@src");
			} catch (Exception e) {
				// Images are optional: the article is still saved without them.
				log.error("获取图片失败", e);
			}

			Date date = new Date();

			// Skip articles that are oversized, have fewer than 20 characters of text once
			// separators and whitespace are removed, or carry more than 20 images.
			if (content.length() > 50000 || content.toString().replaceAll("\\$|[\\s\\p{Zs}]", "").length() < 20 || newImgs.size() > 20) {
				continue;
			}
			try {
				// randomInt(1, 3) excludes the upper bound, so the suffix is 1 or 2.
				int i = RandomUtil.randomInt(1, 3);
				DataSaveUtil.saveData(title, content.toString(), newImgs, url, date, "https://beincrypto.com/", "Beincrypto-Cryptocurrency-" + i);
				log.info(Thread.currentThread().getName() + "-----------------Beincrypto-Cryptocurrency----------------");
			} catch (Exception e) {
				log.error("Failed to save Beincrypto article {}", url, e);
			}
		}
	}

	/**
	 * Fetches each listing page and collects the article links found in it.
	 *
	 * <p>Links appear JSON-escaped in the page source, for example:
	 * {@code "url":"https:\/\/beincrypto.com\/senator-ted-cruz-repeals-bipartisan-tax-bills-effect-on-crypto-industry\/"}.
	 * The backslashes are stripped before the link is returned.
	 *
	 * @param preurls listing page URLs to scan
	 * @return the distinct article URLs found; empty if no page could be fetched
	 */
	private static Set<String> getUrl(List<String> preurls) {
		Set<String> res = new HashSet<>();
		for (String url : preurls) {
			String resp;
			try {
				resp = HttpRequest.get(url)
					.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
					.header("Accept-Encoding", "gzip, deflate")
					.setConnectionTimeout(30000)
					.execute()
					.body();
			} catch (Exception e) {
				log.error("代理失败-beincrypto-{}", url, e);
				continue;
			}
			if (resp == null) {
				// A missing body would otherwise make Pattern.matcher throw and abort the crawl.
				log.warn("Empty response body from listing page {}", url);
				continue;
			}

			Matcher m = ARTICLE_URL_PATTERN.matcher(resp);
			while (m.find()) {
				String candidate = m.group();
				// Article slugs contain hyphens; uploads, author pages and matches that ran
				// past a closing quote are not articles.
				if (candidate.contains("-")
					&& !candidate.contains("uploads")
					&& !candidate.contains("author")
					&& !candidate.contains("\"")) {
					res.add(candidate.replace("\\", ""));
				}
			}
		}
		return res;
	}

	// Manual smoke test for a single article:
//	public static void main(String[] args) throws Exception {
//		Set set = new HashSet();
//		set.add("https://beincrypto.com/senator-ted-cruz-repeals-bipartisan-tax-bills-effect-on-crypto-industry/");
//		getContent(set);
//	}
}
